from __future__ import division
from operator import itemgetter, attrgetter
from struct import *
import gc
import math
import matplotlib
import os
import pylab
import random
import sys
import time

# Progress marker: announces on stdout that the script has started.
print "Begins..."

# NOTE: everything between the triple quotes below is a bare string literal,
# so none of it is executed. It preserves a disabled loader that would build
# docIDANDTrecIDDict (docID -> trecID) from the file named by inputFileName0,
# reading the trecID from field 0 and the docID from field 1 of each
# space-separated line and keeping the first trecID seen for each docID.
'''
docIDANDTrecIDDict = {}
# for moa
inputFileName0 = "/home/diaosi/gov2ClearYourMindAndDoItAgain2014/trecID_docID_MappingTableForGov2Dataset"
inputFileHandler = open(inputFileName0,"r")
currentLine = inputFileHandler.readline()
while currentLine:
    currentLineElements = currentLine.strip().split(" ")
    currentDocID = currentLineElements[1]
    currentTrecID = currentLineElements[0]
    if currentDocID not in docIDANDTrecIDDict:
        docIDANDTrecIDDict[currentDocID] = currentTrecID
    currentLine = inputFileHandler.readline()
print "len(docIDANDTrecIDDict):",len(docIDANDTrecIDDict)
print "docIDANDTrecIDDict['0']:",docIDANDTrecIDDict['0']
inputFileHandler.close()
'''

# Reference table of candidate values for NUM_DOCUMENTS_NEED_TO_PROCESS.
# Each row is "<number of documents> <percentage>".
# NOTE(review): the percentage presumably is the share of all postings covered
# by that many leading documents of inputFileName2 -- confirm; this script
# only prints the raw posting sum and never computes a percentage itself.
# 16234 1%
# 58892 3%
# 113151 5%
# 303078 10%
# 573053 15%
# 960415 20%
# 1704697 30%
# 3048106 40%
# 4840867 50%
# 7078881 60%
# 9726244 70%
# 13270243 80%
# 17993779 90%
# 25205179 100%

# Number of lines (documents) the second pass reads from inputFileName2
# before it stops. The current value matches the "1%" row of the table above.
NUM_DOCUMENTS_NEED_TO_PROCESS = 16234

sumNumOfPostings = 0
docIDWithNumOfPostingsRecordedDict = {}
inputFileName1 = "/home/diaosi/workspace/web-search-engine-wei-2014-April/data/docID_num_of_postings_recorded_in_index_MappingTableForGov2Dataset_sortedByDocID"
inputFileHandler = open(inputFileName1,"r")
currentLine = inputFileHandler.readline()
while currentLine:
    currentLineElements = currentLine.strip().split(" ")
    currentDocID = currentLineElements[0]
    currentNumOfPostings = int(currentLineElements[1])
    sumNumOfPostings += currentNumOfPostings
    if currentDocID not in docIDWithNumOfPostingsRecordedDict:
        docIDWithNumOfPostingsRecordedDict[currentDocID] = currentNumOfPostings
    currentLine = inputFileHandler.readline()
inputFileHandler.close()
print "len(docIDWithNumOfPostingsRecordedDict):",len(docIDWithNumOfPostingsRecordedDict)
print "sumNumOfPostings:",sumNumOfPostings

numOfDocumentsProcessed = 0
sumNumOfPostings = 0
inputFileName2 = "/home/diaosi/gov2ClearYourMindAndDoItAgain2014/gov2_Docs_with_TheirXdocValues_Since20140428_sortedByXdocValues"
inputFileHandler = open(inputFileName2,"r")
currentLine = inputFileHandler.readline()
while currentLine:
    currentLineElements = currentLine.strip().split(" ")
    currentDocID = currentLineElements[1]
    sumNumOfPostings += docIDWithNumOfPostingsRecordedDict[currentDocID]
    numOfDocumentsProcessed += 1
    if NUM_DOCUMENTS_NEED_TO_PROCESS == numOfDocumentsProcessed:
        break
    currentLine = inputFileHandler.readline()
inputFileHandler.close()
print "sumNumOfPostings:",sumNumOfPostings
print "numOfDocumentsProcessed:",numOfDocumentsProcessed

print "OVERALL:"
# print "inputFileName0:",inputFileName0
print "inputFileName1:",inputFileName1
print "inputFileName2:",inputFileName2
print "Ends."
